import sqlite3
import pandas as pd
from datetime import datetime
import pytz
from multiprocessing import Pool, cpu_count
import re

# Database files
databases = [
    "202001_202212_02_mx_youtube_data.db",
    "202301_202312_02_mx_youtube_data.db",
    "202401_202405_02_mx_youtube_data.db",
    "202405_02_mx_youtube_data.db"
]

# Initialize empty dataframe
comments_2020_24 = pd.DataFrame()

# Loop through databases and extract comments
for db in databases:
    print(db)
    # Connect to database
    con = sqlite3.connect(db)
    
    # Read comments
    comments = pd.read_sql_query(
        "SELECT comment_id, text_original, published_at FROM comments",
        con
    )
    
    # Append to main dataframe
    comments_2020_24 = pd.concat([comments_2020_24, comments], ignore_index=True)
    
    # Close connection
    con.close()

# Convert published_at to datetime (UTC)
comments_2020_24['datetime'] = pd.to_datetime(
    comments_2020_24['published_at'], 
    utc=True
)

# Convert to Mexico City timezone
mexico_tz = pytz.timezone('America/Mexico_City')
comments_2020_24['datetime_mx'] = comments_2020_24['datetime'].dt.tz_convert(mexico_tz)

# Word count function
def wc(text):
    """Count words in a string by splitting on whitespace"""
    if pd.isna(text):
        return 0
    return len(text.split())

# Parallel word count using multiprocessing
if __name__ == '__main__':
    text_original = comments_2020_24['text_original'].tolist()
    
    # Use multiprocessing Pool for parallel processing
    with Pool(cpu_count()) as pool:
        word_counts = pool.map(wc, text_original)
    
    comments_2020_24['wc'] = word_counts
    
    # Save to CSV
    comments_2020_24.to_csv('comments_2020_24_wt_wc.csv', index=False)
    
    print("Processing complete! File saved as comments_2020_24_wt_wc.csv")
